{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from data_process import build_vocab, batch_iter, sentence_to_onehot, cal_idf, sentence_to_tfidf\n",
    "from models import logistic_regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('./data/train-5T.txt', delimiter='\\t')\n",
    "test = pd.read_csv('./data/test-1T.txt', delimiter='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = train.document\n",
    "Y_train = train.label\n",
    "X_test = test.document\n",
    "Y_test = test.label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_vocab = 50000\n",
    "vocab, _, vocab_size = build_vocab(X_train, max_vocab)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment Analysis with logistic regression using one_hot encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batches = batch_iter(list(zip(X_train, Y_train)), batch_size=64, num_epochs=15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = tf.ConfigProto()\n",
    "config.gpu_options.allow_growth = True\n",
    "tf.reset_default_graph()\n",
    "sess = tf.Session(config=config)\n",
    "model = logistic_regression(sess=sess, vocab_size=vocab_size, lr=1e-1)\n",
    "train_loss = []\n",
    "train_acc = []\n",
    "test_loss = []\n",
    "test_acc = []\n",
    "\n",
    "for step, batch in enumerate(batches):\n",
    "    x_train, y_train = zip(*batch)\n",
    "    x_train = sentence_to_onehot(x_train, vocab)\n",
    "    acc = model.get_accuracy(x_train, y_train)\n",
    "    l, _ = model.train(x_train, y_train)\n",
    "    train_loss.append(l)\n",
    "    train_acc.append(acc)\n",
    "    \n",
    "    if step % 100 == 0:\n",
    "        test_batches = batch_iter(list(zip(X_test, Y_test)), batch_size=64, num_epochs=1)\n",
    "        for test_batch in test_batches:\n",
    "            x_test, y_test = zip(*test_batch)\n",
    "            x_test = sentence_to_onehot(x_test, vocab)\n",
    "            t_acc = model.get_accuracy(x_test, y_test)\n",
    "            t_loss = model.get_loss(x_test, y_test)\n",
    "            test_loss.append(t_loss)\n",
    "            test_acc.append(t_acc)\n",
    "        print('batch:', '%04d' % step, '\\ntrain loss:', '%.5f' % np.mean(train_loss), \n",
    "              '\\ttest loss:', '%.5f' % np.mean(test_loss))\n",
    "        print('train accuracy:', '%.3f' % np.mean(train_acc), '\\ttest accuracy:', \n",
    "              '%.3f' % np.mean(test_acc), '\\n')\n",
    "        train_loss = []\n",
    "        train_acc = []\n",
    "        test_loss = []\n",
    "        test_acc = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment Analysis with logistic regression using tfidf encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "IDF = cal_idf(X_train, vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "batches = batch_iter(list(zip(X_train, Y_train)), batch_size=64, num_epochs=15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = tf.ConfigProto()\n",
    "config.gpu_options.allow_growth = True\n",
    "tf.reset_default_graph()\n",
    "sess = tf.Session(config=config)\n",
    "model = logistic_regression(sess=sess, vocab_size=vocab_size, lr=1e-1)\n",
    "train_loss = []\n",
    "train_acc = []\n",
    "test_loss = []\n",
    "test_acc = []\n",
    "\n",
    "for step, batch in enumerate(batches):\n",
    "    x_train, y_train = zip(*batch)\n",
    "    x_train = sentence_to_tfidf(x_train, vocab, IDF)\n",
    "    acc = model.get_accuracy(x_train, y_train)\n",
    "    l, _ = model.train(x_train, y_train)\n",
    "    train_loss.append(l)\n",
    "    train_acc.append(acc)\n",
    "    \n",
    "    if step % 100 == 0:\n",
    "        test_batches = batch_iter(list(zip(X_test, Y_test)), batch_size=64, num_epochs=1)\n",
    "        for test_batch in test_batches:\n",
    "            x_test, y_test = zip(*test_batch)\n",
    "            x_test = sentence_to_tfidf(x_test, vocab, IDF)\n",
    "            t_acc = model.get_accuracy(x_test, y_test)\n",
    "            t_loss = model.get_loss(x_test, y_test)\n",
    "            test_loss.append(t_loss)\n",
    "            test_acc.append(t_acc)\n",
    "        print('batch:', '%04d' % step, '\\ntrain loss:', '%.5f' % np.mean(train_loss), \n",
    "              '\\ttest loss:', '%.5f' % np.mean(test_loss))\n",
    "        print('train accuracy:', '%.3f' % np.mean(train_acc), '\\ttest accuracy:', \n",
    "              '%.3f' % np.mean(test_acc), '\\n')\n",
    "        train_loss = []\n",
    "        train_acc = []\n",
    "        test_loss = []\n",
    "        test_acc = []"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python tensor",
   "language": "python",
   "name": "tensor"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
